############################################################################################################################
########Inclusion, recognition, and inter-group comparisons: The effects of power-sharing institutions on grievances########
#####Group-level measurement approach (appendix 5)#####
#####Andreas Juon, 2023 (Journal of Conflict Research)#####
############################################################################################################################

###adapted from the script by Christopher Claassen 2018 (Christopher Claassen. 2018. "Estimating Smooth Panels of Public Opinion". Political Analysis (2018-02-15))
###note: to run, the file supdem.stan.mod1.stan is needed, available from Claassen's replication data: https://doi.org/10.7910/DVN/A47LUM
###for the code to work, download this file and put it in the same working directory that contains the data files

####################################################################################
#########STEP 1: LIBRARIES, DATA IMPORT, DEFINITIONS#########
####################################################################################

setwd("XXXXX")#set to directory that contains the data files

#######1.1 Libraries#######
library(MASS)
library(psych)
library(arm)
library(dplyr)
library(tidyr)
library(rstan)
library(loo)
library(rstanarm)
library(ggplot2)
library(RColorBrewer)

#######1.3 Defining variable lists#######
main1 <- c("state_control","included_nc")
main2 <- c("state_control","state_control * ps_corp","state_control * ps_lib")
main3 <- c("state_control","state_control * ps_corp","state_control * ps_lib","state_control * ps_corp_all_diff_neg")
main4 <- c("state_control","state_control * ps_corp","state_control * ps_lib","state_control * tek_ps_corp_diff_neg")
controls_gc <- c("size","other","ongoing_recent_contestation","cpi","politya","lgdppc","fractionalization1","region")
controls_i <- c("age","gender","educ_high","interest_pol_d")

#######1.4 Reading Data#######
grievances_ind <- read.csv(file="./juon_jcr_grievances_data_ind.csv")
group_data_1992_2018 <- read.csv("./juon_jcr_grievances_data_group_1992_2018.csv")

#######1.5 Stan options#######
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores()-2)

######1.6 Formula for combining ggplot legends######
grid_arrange_shared_legend <- function(..., ncol = length(list(...)), nrow = 1, position = c("bottom", "right")) {
  plots <- list(...)
  position <- match.arg(position)
  g <- ggplotGrob(plots[[1]] + 
                    theme(legend.position = position))$grobs
  legend <- g[[which(sapply(g, function(x) x$name) == "guide-box")]]
  lheight <- sum(legend$height)
  lwidth <- sum(legend$width)
  gl <- lapply(plots, function(x) x +
                 theme(legend.position = "none"))
  gl <- c(gl, ncol = ncol, nrow = nrow)
  
  combined <- switch(position,
                     "bottom" = arrangeGrob(do.call(arrangeGrob, gl), 
                                            legend,ncol = 1,
                                            heights = unit.c(unit(1, "npc") - lheight, lheight)),
                     "right" = arrangeGrob(do.call(arrangeGrob, gl),
                                           legend, ncol = 2,
                                           widths = unit.c(unit(1, "npc") - lwidth, lwidth)))
  
  grid.newpage()
  grid.draw(combined)
  
  invisible(combined)
}

######1.7 Formula for clustering SE's######
get_confint<-function(model, vcovCL){
  t<-qt(.975, model$df.residual)
  ct<-coeftest(model, vcovCL)
  est<-cbind(ct[,1], ct[,1]-t*ct[,2], ct[,1]+t*ct[,2])
  colnames(est)<-c("Estimate","LowerCI","UpperCI")
  return(est)
}
cluster.se<-function(model, cluster)
{
  require(multiwayvcov)
  require(lmtest)
  vcovCL<-cluster.vcov(model, cluster)
  coef<-coeftest(model, vcovCL)
  return(coef)
}


####################################################################################
#########STEP 2: SETTING UP DATA#########
####################################################################################

######2.1 All dependent variable types, all surveys######
surveygroup_base <- subset(grievances_ind, sample_unsat == 1 | sample_disc == 1)
for (j in c("disc_grp", controls_i)) {
  step1 <- paste("surveygroup_base <- surveygroup_base %>% group_by(gwgroupid,year) %>%  mutate(", j, "_mean = mean(",j,",na.rm=T))",sep="")
  eval(parse(text=c(step1)))
}
surveygroup_base$one <- 1
surveygroup_base <- surveygroup_base %>% group_by(gwgroupid,survey,surveywave) %>%  mutate(group_s_number = sum(one,na.rm=T))
surveygroup_base <- subset(surveygroup_base, group_s_number >= 50)
controls_i_df <- unique(surveygroup_base[c("year","gwgroupid","age_mean","gender_mean","educ_high_mean","interest_pol_d_mean")])
surveygroup_base <- surveygroup_base[c("cowcode","gwgroupid","year","survey", "unsat_cab", "unsat_hos", "unsat_parl", "disc_grp")]

######2.2 Constructing averages per survey######
surveygroup_base <- surveygroup_base[c("cowcode","gwgroupid","year","survey", "unsat_cab", "unsat_hos", "unsat_parl", "disc_grp")]
surveygroup_base <- gather(surveygroup_base, variable, measurement, unsat_cab:disc_grp, factor_key=TRUE)
surveygroup_base <- subset(surveygroup_base, !is.na(measurement))
surveygroup_base$one <- 1
surveygroup_base <- surveygroup_base %>% group_by(gwgroupid,survey,variable,year) %>% mutate(survey_sample = sum(one,na.rm=F))
surveygroup_base <- surveygroup_base %>% group_by(gwgroupid,survey,variable,year) %>% mutate(response_number = sum(measurement,na.rm=F))
surveygroup_base <- unique(surveygroup_base[c("cowcode","gwgroupid","year","survey", "variable","survey_sample","response_number")])
surveygroup_base2 <- spread(surveygroup_base, variable, response_number)
for (i in levels(surveygroup_base2$survey)) {
    step1 <- paste("surveygroup_base2$",i,"_sample <- ifelse(surveygroup_base2$survey == '",i,"', surveygroup_base2$survey_sample, NA)",sep="")
    eval(parse(text=c(step1)))
}
surveygroup_base3 <- unique(surveygroup_base2)#10376

######2.3 Split and stack datasets and creating indicators######
names(surveygroup_base3)
sddem1 = gather(data=surveygroup_base3, key="Item", value="Response", 6:9, na.rm=TRUE) 
sddem1 <- subset(sddem1, Response > 0)#!!!
sddem1 = unite(sddem1, ItemProj, c(survey, Item), sep = "_", remove = FALSE)
sddem1$ItemProj = as.factor(sddem1$ItemProj)
year0 = 1992-1
sddem1 = sddem1[sddem1$year >= year0,]
sddem1 = unite(sddem1, year_gwgroupid, c(year, gwgroupid), sep = "_", remove = FALSE)
sddem1$year_gwgroupid = as.factor(sddem1$year_gwgroupid)
sddem1 = unite(sddem1, ItemProjgwgroupid, c(ItemProj, gwgroupid), sep = "_", remove = FALSE)
sddem1$ItemProjgwgroupid = as.factor(sddem1$ItemProjgwgroupid)
sddem1 = unite(sddem1, Itemgwgroupid, c(Item, gwgroupid), sep = "_", remove = FALSE)
sddem1$Itemgwgroupid = as.factor(sddem1$Itemgwgroupid)
sddem1 = unite(sddem1, YrProj, c(year, survey), sep = "_", remove = FALSE)
sddem1$YrProj = as.factor(sddem1$YrProj)
sddem1 = unite(sddem1, YrProjgwgroupid, c(YrProj, gwgroupid), sep = "_", remove = FALSE)
sddem1$YrProjgwgroupid = as.factor(sddem1$YrProjgwgroupid)
sddem1$gwgroupid = as.factor(as.character(sddem1$gwgroupid))
sddem1$Item = as.factor(as.character(sddem1$Item))
sddem1$ItemProj = as.factor(as.character(sddem1$ItemProj))
sddem1$Itemgwgroupid = as.factor(as.character(sddem1$Itemgwgroupid))
sddem1$ItemProjgwgroupid = as.factor(as.character(sddem1$ItemProjgwgroupid))
sddem1$survey = as.factor(as.character(sddem1$survey))
sddem1$YrProj = as.factor(as.character(sddem1$YrProj))
sddem1$YrProjgwgroupid = as.factor(as.character(sddem1$YrProjgwgroupid))
sddem1$year = as.factor(sddem1$year-year0)
length(unique(sddem1$gwgroupid))
length(unique(sddem1$survey))
length(unique(sddem1$Itemgwgroupid))
length(unique(sddem1$YrProjgwgroupid))
cnt.obs.years = rowSums(table(sddem1$gwgroupid, sddem1$year) > 0)
sddem2 = sddem1[sddem1$gwgroupid %in% levels(sddem1$gwgroupid)[cnt.obs.years > 1], ]


####################################################################################
#########STEP 3: STAN ESTIMATION#########
####################################################################################

######3.1 Prepare data for stan######
n.items = length(unique(sddem2$Item))
n.cntrys = length(unique(sddem2$gwgroupid))
n.yrs = 2018-year0
n.proj = length(unique(sddem2$survey))
n.resp = dim(sddem2)[1]
n.itm.prj = length(unique(sddem2$Item))#!!!!
n.itm.cnt = length(unique(sddem2$Itemgwgroupid))
n.cntry.yrs = n.cntrys * n.yrs
n.itm.prj.cnt = length(unique(sddem2$ItemProjgwgroupid))
n.yr.proj.cnt = length(unique(sddem2$YrProjgwgroupid))
cntrys = as.numeric(factor(sddem2$gwgroupid))
cnt.names = sort(unique(sddem2$gwgroupid))
cnt.ccode = sddem2[match(cnt.names, sddem2$gwgroupid), "gwgroupid"]
items = as.numeric(factor(sddem2$Item))
yrs = as.numeric(sddem2$year)
projs = as.numeric(factor(sddem2$survey))
itm.prjs = as.numeric(factor(sddem2$Item))
itm.prj.cnts = as.numeric(factor(sddem2$ItemProjgwgroupid))
itm.cnts = as.numeric(factor(sddem2$Itemgwgroupid))
cntry.yrs = as.numeric(sddem2$year_gwgroupid)

######3.2 Specify data for stan######
dat.1 = list(N=n.resp, K=n.itm.prj, T=n.yrs, J=n.cntrys, jj=cntrys, tt=yrs, kk=itm.prjs, 
             x=sddem2$Response, samp=sddem2$survey_sample)
dat.2 = list(N=n.resp, K=n.itm.prj, T=n.yrs, J=n.cntrys, P=n.itm.prj.cnt, jj=cntrys, tt=yrs, 
             pp=itm.prj.cnts, kk=itm.prjs, x=sddem2$Response, samp=sddem2$survey_sample)
sapply(list(dat.1, dat.2), summary)

######3.3 Parameters######
pars.1 = c("mu_lambda","sigma_lambda","sigma_theta","lambda","theta","x_pred","log_lik")
n.iter = 1000
n.warm = 500
n.chn = 4
n.thin = 2

######3.4 Model######
#note: file supdem.stan.mod1.stan is needed (see lines 7-9 for information on where to download it from)
stan.mod.1 = stan(file='supdem.stan.mod1.stan', data=dat.1, pars=pars.1, 
                  iter=n.iter, warmup=n.warm, chains=n.chn, thin=n.thin, 
                  control=list(adapt_delta=0.99, stepsize=0.005, max_treedepth=14))

######3.5 Evaluate model fit######
model.list = list(stan.mod.1)
table_a19 = matrix(NA, nrow=5, ncol=3)
colnames(table_a19) = c("MAE_int", "pc_improv_int", "LOO-IC")
row.names(table_a19) = c("Mod1", "DGIRT", "cnt_means", "item_means", "grnd_means")
llik = lapply(list(stan.mod.1), extract_log_lik)
(loo.list = sapply(llik, loo))
table_a19[1,3] = paste(loo.list[7])
x.sim = rstan::extract(stan.mod.1, pars=c("x_pred"))
x.sim.pe = sapply(x.sim, function(x) apply(x, 2, mean, na.rm=TRUE)) 
x.sim.err = x.sim.pe
x.sim.err = apply(x.sim.pe, 2, function(x) sddem2$Response/sddem2$survey_sample - x/sddem2$survey_sample)
(mae = apply(x.sim.err, 2, function(x) sum(abs(x), na.rm=TRUE) / length(sddem2$Response)))
table_a19[1,1] = round(mae, 3)
sddem2$gwgroupidMean = ave(sddem2$Response/sddem2$survey_sample, sddem2$gwgroupid)
cnt.mae = sum(abs(sddem2$gwgroupidMean - sddem2$Response/sddem2$survey_sample), na.rm=TRUE) / length(sddem2$Response)
table_a19[3,1] = round(cnt.mae, 3)
table_a19[1,2] = round((cnt.mae - mae) / cnt.mae * 100, 1)
sddem2$ItemMean = ave(sddem2$Response/sddem2$survey_sample, sddem2$ItemProj)
itm.mae = sum(abs(sddem2$ItemMean - sddem2$Response/sddem2$survey_sample), na.rm=TRUE) / length(sddem2$Response)
table_a19[4,1] = round(itm.mae, 3)
table_a19[4,2] = round((cnt.mae - itm.mae) / cnt.mae * 100, 1)
grd.mae = sum(abs(mean(sddem2$Response/sddem2$survey_sample) - sddem2$Response/sddem2$survey_sample), na.rm=TRUE) / 
  length(sddem2$Response)
table_a19[5,1] = round(grd.mae, 3)
table_a19[5,2] = round((cnt.mae - grd.mae) / cnt.mae * 100, 1)

######3.6 Extract theta estimates######
summary(stan.mod.1, pars=c("theta"))$summary[,"mean"][
  c(1, n.cntrys+1, 2*n.cntrys+1, 3*n.cntrys+1, 4*n.cntrys+1, 5*n.cntrys+1)]
theta.out = rstan::extract(stan.mod.1, pars = c("theta"))[[1]]
theta.std = (theta.out - mean(as.vector(theta.out))) / sd(as.vector(theta.out))
theta.out.t = apply( theta.std, 1, function(x) t(x) )
theta.out.df = data.frame(gwgroupid=rep(cnt.names, length.out=n.cntrys*27), 
                          year=rep(1992:2018, each=n.cntrys), theta.out.t)
theta.pe = theta.out.df[,1:2]
theta.pe$theta = apply(theta.out.df[,2:n.iter+2], 1, mean)
theta.pe$theta_sd = apply(theta.out.df[,2:n.iter+2], 1, sd)
first.yr = data.frame(gwgroupid=levels(sddem2$gwgroupid), First_yr = 
                        as.vector(by(sddem2, sddem2$gwgroupid, function(x) min(as.numeric(x$year))+1992)))
theta.pe = merge(theta.pe, first.yr, by="gwgroupid", all.x=TRUE)


####################################################################################
#########STEP 4: EXAMPLE PLOTS#########
####################################################################################

######4.1 Extract and standardize data######
theta.out = rstan::extract(stan.mod.1, pars = c("theta"))[[1]]
dim(theta.out)
theta.std = (theta.out - mean(as.vector(theta.out))) / sd(theta.out)
theta.pe_new = apply(theta.std, c(2,3), mean)
colnames(theta.pe_new) = cnt.names
theta.samp.200 = theta.std[sample(1:dim(theta.std)[1], n.iter/5), , ]
sddem2$PropResp = sddem2$Response / sddem2$survey_sample
itm.means = as.vector(by(sddem2$PropResp, sddem2$ItemProj, mean))
sddem2$PropRespStd = NA
for(i in 1:length(itm.means)) {
  sddem2$PropRespStd[itm.prjs==i] = sddem2$PropResp[itm.prjs==i] - itm.means[i]
}
sddem2$PropRespStd = sddem2$PropRespStd / sd(sddem2$PropRespStd)

######4.2 Plots of Russian groups######
###a) Russians in Russia
plot.cnt = c('36501000')
cnt.raw = sddem2[sddem2$gwgroupid == plot.cnt[1],]
cnt.raw$year = as.numeric(cnt.raw$year) + 1992
full_plot <- "ggplot() +  geom_line(data = data.frame(yvalue=theta.pe_new[,plot.cnt[1]], xvalue=seq(1992,2018)), aes(x=xvalue, y=yvalue), size =1)+
geom_point(data = cnt.raw, aes(x=year, y=PropRespStd, size = survey_sample/100), alpha=0.5)"
for(j in 1:dim(theta.samp.200)[1]) {
  full_plot <- paste(full_plot, "+ geom_line(data = data.frame(yvalue=theta.samp.200[",j,",,cnt.names==plot.cnt[1]], xvalue=seq(1992,2018)), aes(x=xvalue, y=yvalue), size =0.25, alpha = 0.25)", sep="")
}
plot1 <- eval(parse(text=paste(full_plot, " + geom_ribbon(data=subset(theta.pe, gwgroupid == '36501000'), aes(x=year, ymin = theta - 1.96* theta_sd, ymax = theta + 1.96*  theta_sd), fill = 'grey', alpha = 0.2) + theme_bw() + theme(text=element_text(family='Times'), legend.position='bottom') + xlab('year') + ylab('latent grievance') + guides(size=guide_legend(title='Sample (100s)'))")))
###b) Russians in Estonia
plot.cnt = c('36602000')
cnt.raw = sddem2[sddem2$gwgroupid == plot.cnt[1],]
cnt.raw$year = as.numeric(cnt.raw$year) + 1992
full_plot <- "ggplot() +  geom_line(data = data.frame(yvalue=theta.pe_new[,plot.cnt[1]], xvalue=seq(1992,2018)), aes(x=xvalue, y=yvalue), size =1)+
geom_point(data = cnt.raw, aes(x=year, y=PropRespStd, size = survey_sample/100), alpha=0.5)"
for(j in 1:dim(theta.samp.200)[1]) {
  full_plot <- paste(full_plot, "+ geom_line(data = data.frame(yvalue=theta.samp.200[",j,",,cnt.names==plot.cnt[1]], xvalue=seq(1992,2018)), aes(x=xvalue, y=yvalue), size =0.25, alpha = 0.25)", sep="")
}
plot2 <- eval(parse(text=paste(full_plot, " + geom_ribbon(data=subset(theta.pe, gwgroupid == '36602000'), aes(x=year, ymin = theta - 1.96* theta_sd, ymax = theta + 1.96*  theta_sd), fill = 'grey', alpha = 0.2) + theme_bw() + theme(text=element_text(family='Times'), legend.position='bottom') + xlab('year') + ylab('latent grievance') + guides(size=guide_legend(title='Sample (100s)'))")))
###c) Russians in Latvia
plot.cnt = c('36702000')
cnt.raw = sddem2[sddem2$gwgroupid == plot.cnt[1],]
cnt.raw$year = as.numeric(cnt.raw$year) + 1992
full_plot <- "ggplot() +  geom_line(data = data.frame(yvalue=theta.pe_new[,plot.cnt[1]], xvalue=seq(1992,2018)), aes(x=xvalue, y=yvalue), size =1)+
geom_point(data = cnt.raw, aes(x=year, y=PropRespStd, size = survey_sample/100), alpha=0.5)"
for(j in 1:dim(theta.samp.200)[1]) {
  full_plot <- paste(full_plot, "+ geom_line(data = data.frame(yvalue=theta.samp.200[",j,",,cnt.names==plot.cnt[1]], xvalue=seq(1992,2018)), aes(x=xvalue, y=yvalue), size =0.25, alpha = 0.25)", sep="")
}
plot3 <- eval(parse(text=paste(full_plot, " + geom_ribbon(data=subset(theta.pe, gwgroupid == '36702000'), aes(x=year, ymin = theta - 1.96* theta_sd, ymax = theta + 1.96*  theta_sd), fill = 'grey', alpha = 0.2) + theme_bw() + theme(text=element_text(family='Times'), legend.position='bottom') + xlab('year') + ylab('latent grievance') + guides(size=guide_legend(title='Sample (100s)'))")))
###d) Russians in Lithuania
plot.cnt = c('36803000')
cnt.raw = sddem2[sddem2$gwgroupid == plot.cnt[1],]
cnt.raw$year = as.numeric(cnt.raw$year) + 1992
full_plot <- "ggplot() +  geom_line(data = data.frame(yvalue=theta.pe_new[,plot.cnt[1]], xvalue=seq(1992,2018)), aes(x=xvalue, y=yvalue), size =1)+
geom_point(data = cnt.raw, aes(x=year, y=PropRespStd, size = survey_sample/100), alpha=0.5)"
for(j in 1:dim(theta.samp.200)[1]) {
  full_plot <- paste(full_plot, "+ geom_line(data = data.frame(yvalue=theta.samp.200[",j,",,cnt.names==plot.cnt[1]], xvalue=seq(1992,2018)), aes(x=xvalue, y=yvalue), size =0.25, alpha = 0.25)", sep="")
}
plot4 <- eval(parse(text=paste(full_plot, " + geom_ribbon(data=subset(theta.pe, gwgroupid == '36803000'), aes(x=year, ymin = theta - 1.96* theta_sd, ymax = theta + 1.96*  theta_sd), fill = 'grey', alpha = 0.2) + theme_bw() + theme(text=element_text(family='Times'), legend.position='bottom') + xlab('year') + ylab('latent grievance') + guides(size=guide_legend(title='Sample (100s)'))")))
###e) Russians in Ukraine
plot.cnt = c('36902000')
cnt.raw = sddem2[sddem2$gwgroupid == plot.cnt[1],]
cnt.raw$year = as.numeric(cnt.raw$year) + 1992
full_plot <- "ggplot() +  geom_line(data = data.frame(yvalue=theta.pe_new[,plot.cnt[1]], xvalue=seq(1992,2018)), aes(x=xvalue, y=yvalue), size =1)+
geom_point(data = cnt.raw, aes(x=year, y=PropRespStd, size = survey_sample/100), alpha=0.5)"
for(j in 1:dim(theta.samp.200)[1]) {
  full_plot <- paste(full_plot, "+ geom_line(data = data.frame(yvalue=theta.samp.200[",j,",,cnt.names==plot.cnt[1]], xvalue=seq(1992,2018)), aes(x=xvalue, y=yvalue), size =0.25, alpha = 0.25)", sep="")
}
plot5 <- eval(parse(text=paste(full_plot, " + geom_ribbon(data=subset(theta.pe, gwgroupid == '36902000'), aes(x=year, ymin = theta - 1.96* theta_sd, ymax = theta + 1.96*  theta_sd), fill = 'grey', alpha = 0.2) + theme_bw() + theme(text=element_text(family='Times'), legend.position='bottom') + xlab('year') + ylab('latent grievance') + guides(size=guide_legend(title='Sample (100s)'))")))
###f) Russians in Belarus
plot.cnt = c('37002000')
cnt.raw = sddem2[sddem2$gwgroupid == plot.cnt[1],]
cnt.raw$year = as.numeric(cnt.raw$year) + 1992
full_plot <- "ggplot() +  geom_line(data = data.frame(yvalue=theta.pe_new[,plot.cnt[1]], xvalue=seq(1992,2018)), aes(x=xvalue, y=yvalue), size =1)+
geom_point(data = cnt.raw, aes(x=year, y=PropRespStd, size = survey_sample/100), alpha=0.5)"
for(j in 1:dim(theta.samp.200)[1]) {
  full_plot <- paste(full_plot, "+ geom_line(data = data.frame(yvalue=theta.samp.200[",j,",,cnt.names==plot.cnt[1]], xvalue=seq(1992,2018)), aes(x=xvalue, y=yvalue), size =0.25, alpha = 0.25)", sep="")
}
plot6 <- eval(parse(text=paste(full_plot, " + geom_ribbon(data=subset(theta.pe, gwgroupid == '37002000'), aes(x=year, ymin = theta - 1.96* theta_sd, ymax = theta + 1.96*  theta_sd), fill = 'grey', alpha = 0.2) + theme_bw() + theme(text=element_text(family='Times'), legend.position='bottom') + xlab('year') + ylab('latent grievance') + guides(size=guide_legend(title='Sample (100s)'))")))
###g) combine into figure A7
figure_a7 <- grid_arrange_shared_legend(plot1 + ggtitle("Russia") + coord_cartesian(xlim = c(1992,2018), ylim=c(-2,3)) ,plot2 + ggtitle("Estonia") + coord_cartesian(xlim = c(1992,2018), ylim=c(-2,3)),plot3 + ggtitle("Latvia") + coord_cartesian(xlim = c(1992,2018), ylim=c(-2,3)),plot4 + ggtitle("Lithuania") + coord_cartesian(xlim = c(1992,2018), ylim=c(-2,3)),plot5 + ggtitle("Ukraine") + coord_cartesian(xlim = c(1992,2018), ylim=c(-2,3)),plot6 + ggtitle("Belarus") + coord_cartesian(xlim = c(1992,2018), ylim=c(-2,3)), ncol=2,nrow=3)


####################################################################################
#########STEP 5: MODELS#########
####################################################################################

######5.1 Joining together data: Theta, group-level independent variables, individual-level controls######
group_data_1992_2018$gwgroupid <- as.factor(group_data_1992_2018$gwgroupid)
group_analysis <- left_join(theta.pe, group_data_1992_2018, by=c("gwgroupid","year"))
group_analysis <- left_join(group_analysis, controls_i_df, by=c("gwgroupid","year"))

######5.2 Models######
m1_theta_group <- lm(as.formula(paste("theta", paste(c(main1,controls_gc,"age_mean","gender_mean","educ_high_mean", "interest_pol_d_mean"), collapse = " + "), sep = " ~ ")), data = group_analysis)
cl_m1_theta_group <- data.frame(cluster.se(m1_theta_group, as.integer(group_analysis$cowcode))[, 2])
m2_theta_group <- lm(as.formula(paste("theta", paste(c(main2,controls_gc,"age_mean","gender_mean","educ_high_mean", "interest_pol_d_mean"), collapse = " + "), sep = " ~ ")), data = group_analysis)
cl_m2_theta_group <- data.frame(cluster.se(m2_theta_group, as.integer(group_analysis$cowcode))[, 2])
m3_theta_group <- lm(as.formula(paste("theta", paste(c(main3,controls_gc,"age_mean","gender_mean","educ_high_mean", "interest_pol_d_mean"), collapse = " + "), sep = " ~ ")), data = group_analysis)
cl_m3_theta_group <- data.frame(cluster.se(m3_theta_group, as.integer(group_analysis$cowcode))[, 2])
m4_theta_group <- lm(as.formula(paste("theta", paste(c(main4,controls_gc,"age_mean","gender_mean","educ_high_mean", "interest_pol_d_mean"), collapse = " + "), sep = " ~ ")), data = group_analysis)
cl_m4_theta_group <- data.frame(cluster.se(m4_theta_group, as.integer(group_analysis$cowcode))[, 2])
stargazer(m1_theta_group, m2_theta_group, m3_theta_group, m4_theta_group, se = c(cl_m1_theta_group, cl_m2_theta_group, cl_m3_theta_group, cl_m4_theta_group), type="text", style = "ajps", omit = c("survey","region"), dep.var.labels = c("Theta", "Theta", "Theta", "Theta"), notes = c("* p<0.1; ** p<0.05; *** p<0.01. Survey- and region-fixed effects included but not reported. Country-clustered standard errors in parentheses."), notes.append=FALSE, covariate.labels = c("Core","Included non-core","Corporate PSI (group)","Liberal PSI (group)","Corporate PSI (difference, domestic)","Corporate PSI (difference, TEK)","Relative size","non-EPR group","Recent contestation","CPI","Polity (normalized)","GDP p.c. (logged)","Ethnic fractionalization","Age mean","Female mean","High education mean","Political interest mean","Core x corporate PSI (group)","Core x liberal PSI (group)","Core x corporate PSI (difference, domestic)","Core x corporate PSI (difference, TEK)"))


